Refactor WebsiteAgent#check_url.

The method body is currently too long and too deeply nested, so split it
into several smaller methods.

Akinori MUSHA 10 lat temu
rodzic
commit
432f952b0a
1 zmienionych plików z 99 dodań i 77 usunięć
  1. 99 77
      app/models/agents/website_agent.rb

+ 99 - 77
app/models/agents/website_agent.rb

@@ -161,78 +161,46 @@ module Agents
161 161
               log "Storing new result for '#{name}': #{doc.inspect}"
162 162
               create_event :payload => doc
163 163
             end
164
-          else
165
-            output = {}
166
-            interpolated['extract'].each do |name, extraction_details|
167
-              case extraction_type
168
-              when "text"
169
-                regexp = Regexp.new(extraction_details['regexp'])
170
-                result = []
171
-                doc.scan(regexp) {
172
-                  result << Regexp.last_match[extraction_details['index']]
173
-                }
174
-                log "Extracting #{extraction_type} at #{regexp}: #{result}"
175
-              when "json"
176
-                result = Utils.values_at(doc, extraction_details['path'])
177
-                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
178
-              else
179
-                case
180
-                when css = extraction_details['css']
181
-                  nodes = doc.css(css)
182
-                when xpath = extraction_details['xpath']
183
-                  doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds
184
-                  nodes = doc.xpath(xpath)
185
-                else
186
-                  error '"css" or "xpath" is required for HTML or XML extraction'
187
-                  return
188
-                end
189
-                case nodes
190
-                when Nokogiri::XML::NodeSet
191
-                  result = nodes.map { |node|
192
-                    case value = node.xpath(extraction_details['value'])
193
-                    when Float
194
-                      # Node#xpath() returns any numeric value as float;
195
-                      # convert it to integer as appropriate.
196
-                      value = value.to_i if value.to_i == value
197
-                    end
198
-                    value.to_s
199
-                  }
200
-                else
201
-                  error "The result of HTML/XML extraction was not a NodeSet"
202
-                  return
203
-                end
204
-                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
205
-              end
206
-              output[name] = result
164
+            next
165
+          end
166
+
167
+          output =
168
+            case extraction_type
169
+            when 'json'
170
+              extract_json(doc)
171
+            when 'text'
172
+              extract_text(doc)
173
+            else
174
+              extract_xml(doc)
207 175
             end
208 176
 
209
-            num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
177
+          num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
210 178
 
211
-            if num_unique_lengths.length != 1
212
-              error "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
213
-              return
214
-            end
179
+          if num_unique_lengths.length != 1
180
+            raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
181
+          end
215 182
 
216
-            old_events = previous_payloads num_unique_lengths.first
217
-            num_unique_lengths.first.times do |index|
218
-              result = {}
219
-              interpolated['extract'].keys.each do |name|
220
-                result[name] = output[name][index]
221
-                if name.to_s == 'url'
222
-                  result[name] = (response.env[:url] + result[name]).to_s
223
-                end
183
+          old_events = previous_payloads num_unique_lengths.first
184
+          num_unique_lengths.first.times do |index|
185
+            result = {}
186
+            interpolated['extract'].keys.each do |name|
187
+              result[name] = output[name][index]
188
+              if name.to_s == 'url'
189
+                result[name] = (response.env[:url] + result[name]).to_s
224 190
               end
191
+            end
225 192
 
226
-              if store_payload!(old_events, result)
227
-                log "Storing new parsed result for '#{name}': #{result.inspect}"
228
-                create_event :payload => result
229
-              end
193
+            if store_payload!(old_events, result)
194
+              log "Storing new parsed result for '#{name}': #{result.inspect}"
195
+              create_event :payload => result
230 196
             end
231 197
           end
232 198
         else
233
-          error "Failed: #{response.inspect}"
199
+          raise "Failed: #{response.inspect}"
234 200
         end
235 201
       end
202
+    rescue => e
203
+      error e.message
236 204
     end
237 205
 
238 206
     def receive(incoming_events)
@@ -266,7 +234,7 @@ module Agents
266 234
             old_event.expires_at = new_event_expiration_date
267 235
             old_event.save!
268 236
             return false
269
-         end
237
+          end
270 238
         end
271 239
         return true
272 240
       end
@@ -305,27 +273,81 @@ module Agents
305 273
       end).to_s
306 274
     end
307 275
 
276
+    def extract_each(doc, &block)
277
+      interpolated['extract'].each_with_object({}) { |(name, extraction_details), output|
278
+        output[name] = block.call(extraction_details)
279
+      }
280
+    end
281
+
282
+    def extract_json(doc)
283
+      extract_each(doc) { |extraction_details|
284
+        result = Utils.values_at(doc, extraction_details['path'])
285
+        log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
286
+        result
287
+      }
288
+    end
289
+
290
+    def extract_text(doc)
291
+      extract_each(doc) { |extraction_details|
292
+        regexp = Regexp.new(extraction_details['regexp'])
293
+        result = []
294
+        doc.scan(regexp) {
295
+          result << Regexp.last_match[extraction_details['index']]
296
+        }
297
+        log "Extracting #{extraction_type} at #{regexp}: #{result}"
298
+        result
299
+      }
300
+    end
301
+
302
+    def extract_xml(doc)
303
+      extract_each(doc) { |extraction_details|
304
+        case
305
+        when css = extraction_details['css']
306
+          nodes = doc.css(css)
307
+        when xpath = extraction_details['xpath']
308
+          doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds
309
+          nodes = doc.xpath(xpath)
310
+        else
311
+          raise '"css" or "xpath" is required for HTML or XML extraction'
312
+        end
313
+        case nodes
314
+        when Nokogiri::XML::NodeSet
315
+          result = nodes.map { |node|
316
+            case value = node.xpath(extraction_details['value'])
317
+            when Float
318
+              # Node#xpath() returns any numeric value as float;
319
+              # convert it to integer as appropriate.
320
+              value = value.to_i if value.to_i == value
321
+            end
322
+            value.to_s
323
+          }
324
+        else
325
+          raise "The result of HTML/XML extraction was not a NodeSet"
326
+        end
327
+        log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
328
+        result
329
+      }
330
+    end
331
+
308 332
     def parse(data)
309 333
       case extraction_type
310
-        when "xml"
311
-          Nokogiri::XML(data)
312
-        when "json"
313
-          JSON.parse(data)
314
-        when "html"
315
-          Nokogiri::HTML(data)
316
-        when "text"
317
-          data
318
-        else
319
-          raise "Unknown extraction type #{extraction_type}"
334
+      when "xml"
335
+        Nokogiri::XML(data)
336
+      when "json"
337
+        JSON.parse(data)
338
+      when "html"
339
+        Nokogiri::HTML(data)
340
+      when "text"
341
+        data
342
+      else
343
+        raise "Unknown extraction type #{extraction_type}"
320 344
       end
321 345
     end
322 346
 
323 347
     def is_positive_integer?(value)
324
-      begin
325
-        Integer(value) >= 0
326
-      rescue
327
-        false
328
-      end
348
+      Integer(value) >= 0
349
+    rescue
350
+      false
329 351
     end
330 352
   end
331 353
 end